import evaluate

# Load the metric registered under the name "bleu" in the `evaluate` library.
bleu = evaluate.load("bleu")

# Token that separates segments inside one raw output line.
_END_OF_TEXT = "<|endoftext|>"
# NOTE: this is a literal backslash followed by "n" on both sides of the tag,
# not a real newline character; the raw files are split on that escaped form.
_ASSISTANT_MARKER = "\\n<|assistant|>\\n"


def _read_lines(path: str):
    """Return every line of *path* (line endings kept) and close the file.

    The original script called ``open(...)`` inline and never closed the
    handle; the ``with`` block releases it deterministically.
    """
    with open(path, "r") as handle:
        return handle.readlines()


def _extract_response(raw_line: str) -> str:
    """Pull the final assistant response out of one raw output line.

    The line is split on ``<|endoftext|>`` and the second-to-last piece is
    kept, i.e. the text immediately before the last end-of-text token.
    Within that piece, everything after the last assistant marker is
    returned (the whole piece if the marker is absent).

    Raises:
        IndexError: if *raw_line* contains no ``<|endoftext|>`` token, since
            there is then no second-to-last piece.
    """
    before_final_eot = raw_line.split(_END_OF_TEXT)[-2]
    return before_final_eot.split(_ASSISTANT_MARKER)[-1]


# Reference texts, one per line, with trailing whitespace removed.
groundtruth = [line.rstrip() for line in _read_lines("groundtruthfile.txt")]

# Extracted responses from the two raw output files (presumably OLMo
# generations, one per line, judging by the file names; confirm upstream).
# NOTE(review): `normallines` is not consumed by the scoring code visible in
# this script; it is kept so the module-level name stays available.
normallines = [
    _extract_response(line) for line in _read_lines("normal_olmo_raw.txt")
]
perturbedlines = [
    _extract_response(line) for line in _read_lines("perturbed_olmo_raw.txt")
]

# Score the perturbed predictions against the ground-truth references.
# This runs before any printing, so a failure here produces no partial output.
results = bleu.compute(references=groundtruth, predictions=perturbedlines)

# Echo every prediction that was scored, one per output line.
for prediction in perturbedlines:
    print(prediction)

# The value reported is the "bleu" entry of the metric's result mapping.
bleu_score = results["bleu"]
print("BLEU score:", bleu_score)